Compare arid1b hom and het adult samples.
library("tidyverse")
library("DESeq2") #finding differentially expressed genes
library("cowplot") #arranging plots into grids
library("viridis") #viridis color schemes
library("scales") #use to get color schemes for viridis
library("ggrepel") #repels text labels on plots
library("RColorBrewer") #pick colors
library("DT") #interactive and searchable tables of our GSEA results
library("GSEABase") #functions and methods for Gene Set Enrichment Analysis
library("Biobase") #base functions for bioconductor; required by GSEABase
library("GSVA") #Gene Set Variation Analysis, a non-parametric and unsupervised method for estimating variation of gene set enrichment across samples.
library("gprofiler2") #tools for accessing the GO enrichment results using g:Profiler web resources
library("clusterProfiler") # provides a suite of tools for functional enrichment analysis
library("msigdbr") # access to msigdb collections directly within R
library("enrichplot") # great for making the standard GSEA enrichment plots
library("ontologyIndex") #for parsing obo files
library("BaseSet") #for importing gaf file
library("plotly") #make interactive plots
library("orthogene") #convert gene names between species
library("ChIPseeker") #for annotating and browsing chip-seq data
library("TxDb.Hsapiens.UCSC.hg38.knownGene", pos = .Machine$integer.max) #human gene annotation for chip-seq data
library("EnsDb.Hsapiens.v75", pos = .Machine$integer.max) #human gene annotation for chip-seq data
library("org.Dr.eg.db") #convert ensembl gene names to gene id
library("lattice") #used for making manhattan plot
library("ggpubr") #calculating correlation coefficient
library("colorspace") #colors for heatmap
library("ggraph") #library for making network graphs
library("svglite") #export svgs in loop
library("eulerr") #make venn diagrams#import deseq output for arid1b data
arid1b <- read.csv("arid1badbrain-allhomvshetNOb1hom3COMBAT_allresults_wt_hom-with-normalized.csv")[,2:9]
#import normalized counts for arid1b data
arid1b_genecounts <- read.csv("arid1badbrain-allhomvshetNOb1hom3COMBAT_normalized_reads_gene_list.csv")[,2:19]
#add some information
#limit padj to 1e-10
arid1b <- arid1b %>% mutate(padjbound = ifelse(arid1b$padj < 1e-10, 1e-10, arid1b$padj))
#limit pvalue to 1e-10
arid1b <- arid1b %>% mutate(pvaluebound = ifelse(arid1b$pvalue < 1e-10, 1e-10, arid1b$pvalue))
#import annotations for chromosome and TSS
chromosomeinfo <- readr::read_tsv(file="ncbi_refseqgenes")
#keep chromosome and gene name
chromosomeinfo <- chromosomeinfo %>% dplyr::select(chrom, name2, txStart)
arid1b$chrom <- NA
arid1b$txStart <- NA
#add chromosome and txStart information
for (x in 1:length(arid1b$LLgeneAbbrev)) {
gene <- arid1b$LLgeneAbbrev[x]
chromosomeinfosub <- chromosomeinfo %>% dplyr::filter(name2 == gene)
chr <- chromosomeinfosub$chrom[1]
txStart <- chromosomeinfosub$txStart[1]
arid1b$chrom[x] <- chr
arid1b$txStart[x] <- txStart
}
#add a column with chromosome number
arid1b <- arid1b %>% mutate(chromosomenumber = str_replace_all(chrom, c("chr1"="1", "chr2"="2", "chr3"="3", "chr4"="4", "chr5"="5", "chr6"="6", "chr7"="7", "chr8"="8", "chr9"="9", "chr10"="10", "chr11"="11", "chr12"="12", "chr13"="13", "chr14"="14", "chr15"="15", "chr16"="16", "chr17"="17", "chr18"="18", "chr19"="19", "chr20"="20", "chr21"="21", "chr22"="22", "chr23"="23", "chr24"="24", "chr25"="25")))
arid1b <- distinct(arid1b)
#export csv
write.csv(arid1b, file = "arid1b_annotated.csv")Are differentially expressed genes located at a particular place in the genome?
This function is from: https://genome.sph.umich.edu/wiki/Code_Sample:_Generating_Manhattan_Plots_in_R
#import comparisons (to skip running previous steps)
arid1b <- read.csv("arid1b_annotated.csv")[,2:14]
#function for making manhattan plots
manhattan.plot<-function(chr, pos, pvalue,
sig.level=NA, annotate=NULL, ann.default=list(),
should.thin=T, thin.pos.places=2, thin.logp.places=2,
xlab="Chromosome", ylab=expression(-log[10](p-value)),
col=c("gray","darkgray"), panel.extra=NULL, pch=20, cex=0.8,...) {
if (length(chr)==0) stop("chromosome vector is empty")
if (length(pos)==0) stop("position vector is empty")
if (length(pvalue)==0) stop("pvalue vector is empty")
#make sure we have an ordered factor
if(!is.ordered(chr)) {
chr <- ordered(chr)
} else {
chr <- chr[,drop=T]
}
#make sure positions are in kbp
if (any(pos>1e6)) pos<-pos/1e6;
#calculate absolute genomic position
#from relative chromosomal positions
posmin <- tapply(pos,chr, min);
posmax <- tapply(pos,chr, max);
posshift <- head(c(0,cumsum(posmax)),-1);
names(posshift) <- levels(chr)
genpos <- pos + posshift[chr];
getGenPos<-function(cchr, cpos) {
p<-posshift[as.character(cchr)]+cpos
return(p)
}
#parse annotations
grp <- NULL
ann.settings <- list()
label.default<-list(x="peak",y="peak",adj=NULL, pos=3, offset=0.5,
col=NULL, fontface=NULL, fontsize=NULL, show=F)
parse.label<-function(rawval, groupname) {
r<-list(text=groupname)
if(is.logical(rawval)) {
if(!rawval) {r$show <- F}
} else if (is.character(rawval) || is.expression(rawval)) {
if(nchar(rawval)>=1) {
r$text <- rawval
}
} else if (is.list(rawval)) {
r <- modifyList(r, rawval)
}
return(r)
}
if(!is.null(annotate)) {
if (is.list(annotate)) {
grp <- annotate[[1]]
} else {
grp <- annotate
}
if (!is.factor(grp)) {
grp <- factor(grp)
}
} else {
grp <- factor(rep(1, times=length(pvalue)))
}
ann.settings<-vector("list", length(levels(grp)))
ann.settings[[1]]<-list(pch=pch, col=col, cex=cex, fill=col, label=label.default)
if (length(ann.settings)>1) {
lcols<-trellis.par.get("superpose.symbol")$col
lfills<-trellis.par.get("superpose.symbol")$fill
for(i in 2:length(levels(grp))) {
ann.settings[[i]]<-list(pch=pch,
col=lcols[(i-2) %% length(lcols) +1 ],
fill=lfills[(i-2) %% length(lfills) +1 ],
cex=cex, label=label.default);
ann.settings[[i]]$label$show <- T
}
names(ann.settings)<-levels(grp)
}
for(i in 1:length(ann.settings)) {
if (i>1) {ann.settings[[i]] <- modifyList(ann.settings[[i]], ann.default)}
ann.settings[[i]]$label <- modifyList(ann.settings[[i]]$label,
parse.label(ann.settings[[i]]$label, levels(grp)[i]))
}
if(is.list(annotate) && length(annotate)>1) {
user.cols <- 2:length(annotate)
ann.cols <- c()
if(!is.null(names(annotate[-1])) && all(names(annotate[-1])!="")) {
ann.cols<-match(names(annotate)[-1], names(ann.settings))
} else {
ann.cols<-user.cols-1
}
for(i in seq_along(user.cols)) {
if(!is.null(annotate[[user.cols[i]]]$label)) {
annotate[[user.cols[i]]]$label<-parse.label(annotate[[user.cols[i]]]$label,
levels(grp)[ann.cols[i]])
}
ann.settings[[ann.cols[i]]]<-modifyList(ann.settings[[ann.cols[i]]],
annotate[[user.cols[i]]])
}
}
rm(annotate)
#reduce number of points plotted
if(should.thin) {
thinned <- unique(data.frame(
logp=round(-log10(pvalue),thin.logp.places),
pos=round(genpos,thin.pos.places),
chr=chr,
grp=grp)
)
logp <- thinned$logp
genpos <- thinned$pos
chr <- thinned$chr
grp <- thinned$grp
rm(thinned)
} else {
logp <- -log10(pvalue)
}
rm(pos, pvalue)
gc()
#custom axis to print chromosome names
axis.chr <- function(side,...) {
if(side=="bottom") {
panel.axis(side=side, outside=T,
at=((posmax+posmin)/2+posshift),
labels=levels(chr),
ticks=F, rot=0,
check.overlap=F
)
} else if (side=="top" || side=="right") {
panel.axis(side=side, draw.labels=F, ticks=F);
}
else {
axis.default(side=side,...);
}
}
#make sure the y-lim covers the range (plus a bit more to look nice)
prepanel.chr<-function(x,y,...) {
A<-list();
maxy<-ceiling(max(y, ifelse(!is.na(sig.level), -log10(sig.level), 0)))+.5;
A$ylim=c(0,maxy);
A;
}
xyplot(logp~genpos, chr=chr, groups=grp,
axis=axis.chr, ann.settings=ann.settings,
prepanel=prepanel.chr, scales=list(axs="i"),
panel=function(x, y, ..., getgenpos) {
if(!is.na(sig.level)) {
#add significance line (if requested)
panel.abline(h=-log10(sig.level), lty=2);
}
panel.superpose(x, y, ..., getgenpos=getgenpos);
if(!is.null(panel.extra)) {
panel.extra(x,y, getgenpos, ...)
}
},
panel.groups = function(x,y,..., subscripts, group.number) {
A<-list(...)
#allow for different annotation settings
gs <- ann.settings[[group.number]]
A$col.symbol <- gs$col[(as.numeric(chr[subscripts])-1) %% length(gs$col) + 1]
A$cex <- gs$cex[(as.numeric(chr[subscripts])-1) %% length(gs$cex) + 1]
A$pch <- gs$pch[(as.numeric(chr[subscripts])-1) %% length(gs$pch) + 1]
A$fill <- gs$fill[(as.numeric(chr[subscripts])-1) %% length(gs$fill) + 1]
A$x <- x
A$y <- y
do.call("panel.xyplot", A)
#draw labels (if requested)
if(gs$label$show) {
gt<-gs$label
names(gt)[which(names(gt)=="text")]<-"labels"
gt$show<-NULL
if(is.character(gt$x) | is.character(gt$y)) {
peak = which.max(y)
center = mean(range(x))
if (is.character(gt$x)) {
if(gt$x=="peak") {gt$x<-x[peak]}
if(gt$x=="center") {gt$x<-center}
}
if (is.character(gt$y)) {
if(gt$y=="peak") {gt$y<-y[peak]}
}
}
if(is.list(gt$x)) {
gt$x<-A$getgenpos(gt$x[[1]],gt$x[[2]])
}
do.call("panel.text", gt)
}
},
xlab=xlab, ylab=ylab,
panel.extra=panel.extra, getgenpos=getGenPos, ...
);
}
#select columns to include in manhattan plot
myTopHits.df <- arid1b %>% dplyr::select(chromosomenumber, txStart, pvaluebound)
#filter to only include chromosomes 1-25
myTopHits.df <- myTopHits.df %>% dplyr::filter(chromosomenumber %in% 1:25)
#make chromosomes plotted in order
myTopHits.df$chromosomenumber <- factor(myTopHits.df$chromosomenumber, levels = 1:25)
#omit NA values
myTopHits.df <- na.omit(myTopHits.df)
#make colors
manhattancol <- c(replicate(9, c("lightgrey", "darkgrey")), "lightgrey", "#21908CFF", replicate(4, c("lightgrey", "darkgrey")))
manhattan <- myTopHits.df
#make manhattan plot
manhattan.plot(manhattan$chromosomenumber, manhattan$txStart, manhattan$pvaluebound, should.thin=F, col=manhattancol)#import annotations for chromosome and gene name
chromosomeinfo <- readr::read_tsv(file="ncbi_refseqgenes")
#keep chromosome and gene name
chromosomeinfo <- chromosomeinfo %>% dplyr::select(chrom, name2)
#change column names
colnames(chromosomeinfo) <- c("chromosome", "genename")
#select only chromosomes 1-25
chromosomeinfo <- chromosomeinfo %>% dplyr::filter(chromosome %in% paste("chr", 1:25, sep=""))
#keep only distinct rows
chromosomeinfo <- distinct(chromosomeinfo)# Perform GSEA using clusterProfiler
#filter to only one comparison
GSEAgenes <- arid1b
#keep only the columns we need for GSEA
mydata.df.sub <- dplyr::select(GSEAgenes, LLgeneAbbrev, log2FoldChange)
#sort by abs(foldchange)
mydata.df.sub$log2FoldChange <- abs(mydata.df.sub$log2FoldChange)
# construct a named vector
mydata.gsea <- mydata.df.sub$log2FoldChange
names(mydata.gsea) <- as.character(mydata.df.sub$LLgeneAbbrev)
mydata.gsea <- sort(mydata.gsea, decreasing = TRUE)
# run GSEA using the 'GSEA' function from clusterProfiler
myGSEA.res <- GSEA(mydata.gsea, TERM2GENE=chromosomeinfo, verbose=FALSE, scoreType="pos", maxGSSize=2000, pvalueCutoff = 1, seed=TRUE)
myGSEA.df <- as_tibble(myGSEA.res@result)
# view results as an interactive table
datatable(myGSEA.df,
extensions = c('KeyTable', "FixedHeader"),
options = list(keys = TRUE, searchHighlight = TRUE, pageLength = 10, lengthMenu = c("10", "25", "50", "100"))) %>%
formatRound(columns=c(2:10), digits=2)No chromosome is enriched for differentially expressed genes in arid1b mutants.
#import normalized counts for arid1b data
arid1b_genecounts <- read.csv("arid1badbrain-allhomvshetNOb1hom3COMBAT_normalized_reads_gene_list.csv")[,2:19]
#pick out a genes to plot
geneofinterest <- unique(c("rln3a", "pmchl", "ucn3l", "pmch", "plxnb2b", "plxna2", "sema3aa", "per1a", "per3", "bcl6ab", "ctbp2a", "etv4", "hbp1", "jund", "klf7b", "rcor1", "sox9b", "tox3", "cremb", "csrnp1a", "dmbx1a", "foxp2", "kdm2ba", "onecutl", "pitx2", "shox2", "stat1a", "sall2", "srebf1", "stox1", "tcf15", "usf2", "zbtb22a", "zbtb47b", "znf362b", "znf395a", "znf740a", "ppp1caa", "ppp2r1ba", "ppp3ccb", "arid1b", "rln3a", "XLOC_033136", "cav1", "gabra6b", "pmch", "pmchl", "si:ch211-237l4.6", "kcnh4b", "dedd1", "kif5ab", "adcy7", "abcg1", "smc1a", "uacab", "acap3b", "per1a", "per3", "ucn3l", "cxadr", "s100b", "fkbp5", "ppp1caa", "ppp2r1ba", "ppp3ccb", "prickle2a", "foxp2", "fkbp5", "htr1aa", "isl1", "rln3b", "per1b", "cry2", "nr1d2b", "roraa", "rorab", "rorca", "rorcb"))
#make new columns with mean counts of het samples
genecountslogfc <- arid1b_genecounts
genecountslogfc$arid1bavg <- rowMeans(genecountslogfc[,c(4,6,8,10,12,13,14,15)])
#divide columns by mean counts of hets
genecountslogfc <- genecountslogfc %>% mutate(across(colnames(genecountslogfc)[4:18], function(x) x/arid1bavg))
#pull out data for a select gene
generepdata <- genecountslogfc %>% dplyr::filter(LLgeneAbbrev %in% geneofinterest)
#get rid of unnecessary columns
generepdata <- generepdata[,c(1,4:18)]
#pivot longer to make tidy
generepdata <- generepdata %>% pivot_longer(!LLgeneAbbrev, names_to = "sample", values_to = "foldchange")
#add condition information based on sampleName
generepdata$genotype <- generepdata$sample
#substitute condition name based on replicate
generepdata <- generepdata %>% mutate(genotype = str_replace_all(genotype, c("^het.*"="het", "^hom.*"="hom")))
#order data on plot
generepdata$genotype <- factor(generepdata$genotype, levels=c("het", "hom"))
#set colors
colors <- c("#472D7BFF", "#21908CFF")
#make an empty list
plot_list = list()
#make all the plots
for (z in 1:length(geneofinterest)) {
genedata2dpf <- generepdata %>% dplyr::filter(LLgeneAbbrev == geneofinterest[z])
plot <- ggplot(genedata2dpf, aes(fill=genotype, y=foldchange, x=genotype)) +
geom_bar(position="dodge", stat="summary") +
geom_point(position = position_dodge(width = .9)) +
labs(title = paste(geneofinterest[z])) +
ylab("fold change") +
theme_bw() +
scale_fill_manual(values = colors)
plot_list[[z]] <- plot
}
# Save plots to svg Makes a separate file for each plot.
for (i in 1:length(geneofinterest)) {
file_name = paste("arid1b_foldchange_", geneofinterest[i], ".svg", sep="")
svglite(file_name)
print(plot_list[[i]])
dev.off()
} #subset data to arid1b
myTopHits.df <- arid1b
#edit padj bound to e-20
myTopHits.df <- myTopHits.df %>% dplyr::mutate(padjbound = ifelse(padj < 1e-20, 1e-20, padj))
#add column about whether chr is 20
myTopHits.df <- myTopHits.df %>% dplyr::mutate(chrom = replace_na(chrom, "none"))
myTopHits.df <- myTopHits.df %>% mutate(chromosome = ifelse(chrom == "chr20", "chromosome 20", "other chromosome"))
#list of hormones
myTopHits.hormones <- myTopHits.df %>% dplyr::filter(LLgeneAbbrev %in% c("rln3a", "pmchl", "ucn3l", "pmch"))
myTopHits.hormones$category <- "hormones"
myTopHits.semaphorin <- myTopHits.df %>% dplyr::filter(LLgeneAbbrev %in% c("plxnb2b", "plxna2", "sema3aa"))
myTopHits.semaphorin$category <- "semaphorin pathway"
myTopHits.sleep <- myTopHits.df %>% dplyr::filter(LLgeneAbbrev %in% c("per1a", "per3", "per1b", "cry2", "nr1d2b", "roraa", "rorab", "rorca", "rorcb"))
myTopHits.sleep$category <- "sleep"
myTopHits.transcription <- myTopHits.df %>% dplyr::filter(LLgeneAbbrev %in% c("bcl6ab", "ctbp2a", "etv4", "hbp1", "jund", "klf7b", "rcor1", "sox9b", "tox3", "cremb", "csrnp1a", "dmbx1a", "foxp2", "kdm2ba", "onecutl", "pitx2", "shox2", "stat1a", "sall2", "srebf1", "stox1", "tcf15", "usf2", "zbtb22a", "zbtb47b", "znf362b", "znf395a", "znf740a"))
myTopHits.transcription$category <- "regulation of transcription"
myTopHits.phosphatase <- myTopHits.df %>% dplyr::filter(LLgeneAbbrev %in% c("ppp1caa", "ppp2r1ba", "ppp3ccb"))
myTopHits.phosphatase$category <- "protein phosphatase"
#genes to label
myTopHits.labels <- rbind(myTopHits.hormones, myTopHits.semaphorin, myTopHits.sleep, myTopHits.transcription, myTopHits.phosphatase)
#change order that points are plotted
myTopHits.labels <- myTopHits.labels %>% arrange(match(category, c("regulation of transcription", "sleep", "hormones", "semaphorin pathway", "protein phosphatase")), desc(category))
genestolabel <- c("arid1b", "rln3a", "XLOC_033136", "cav1", "gabra6b", "pmch", "pmchl", "si:ch211-237l4.6", "kcnh4b", "dedd1", "kif5ab", "adcy7", "abcg1", "smc1a", "uacab", "acap3b", "per1a", "per3", "ucn3l", "cxadr", "s100b", "fkbp5", "ppp1caa", "ppp2r1ba", "ppp3ccb", "prickle2a", "foxp2", "fkbp5")
#subset to only include labeled genes
myTopHits.labels.all <- myTopHits.df %>% dplyr::filter(LLgeneAbbrev %in% genestolabel)
#make all points other
myTopHits.df <- myTopHits.df %>% dplyr::mutate(mutation = "other")
#make the plot
arid1b_volcano <- ggplot() +
annotate("rect", xmin = 1, xmax = 2.5, ymin = -log10(0.05), ymax = 6.5,
alpha=.15, fill="grey") +
annotate("rect", xmin = -1, xmax = -2.5, ymin = -log10(0.05), ymax = 6.5,
alpha=.15, fill="grey") +
geom_point(data=myTopHits.df, aes(y=-log10(padjbound), x=log2FoldChange, shape = chromosome, color = mutation, text = paste("Symbol:", LLgeneAbbrev)), size=2) +
geom_point(data=myTopHits.labels, aes(y=-log10(padjbound), x = log2FoldChange, color=category, shape=chromosome), size=2, show.legend = T) +
theme_bw() +
coord_cartesian(xlim = c(-2.6, 2.6), ylim = c(-0.5, 7), expand = FALSE) +
ylab("-log10(padj)") +
xlab("log2 fold change") +
geom_label_repel(data=myTopHits.labels.all, aes(x=log2FoldChange, y=-log10(padjbound), label=LLgeneAbbrev), force = 3, nudge_y = 1, size = 2.5, max.overlaps = Inf, show.legend = FALSE, color = "black") + #label selected genes
theme_bw() +
scale_color_manual(values = c("#AADC32FF", "darkgrey", "#27AD81FF", "#472D7BFF","gold", "deepskyblue", "orange", "#D697FF"), name = "pathway") +
scale_shape_manual(values=c(4, 16))
arid1b_volcanoGet a list of the top 300 genes dysregulated in arid1b by padj to put into DAVID.
#select arid1b genes that aren't on chromosome 20
arid1b_genesfordavid <- arid1b %>% dplyr::filter(chromosomenumber != 20)
#sort by pvalue
arid1b_genesfordavid <- arid1b_genesfordavid %>% dplyr::arrange(padj)
#get names of top 300 genes
arid1b_genesfordavid <- arid1b_genesfordavid$LLgeneAbbrev[1:300]
write.csv(arid1b_genesfordavid, "arid1b_output/arid1b_genesfordavid.csv")
# use the 'gost' function from the gprofiler2 package to run GO enrichment analysis
#genes shared between comparisons - both up and downregulated
gost.res.homwtall <- gost(arid1b_genesfordavid, organism = "drerio", correction_method = "fdr", significant = TRUE, evcodes = TRUE, ordered_query = TRUE)
#export to csv
write.csv(apply(gost.res.homwtall$result,2,as.character), file="arid1b_output/arid1b_GO-analysis.csv")
# produce a manhattan plot of enriched GO terms
gostplot(gost.res.homwtall, interactive = T, capped = T) # choose a specific msigdb collection/subcollection
dr_gsea_c5 <- msigdbr(species = "Danio rerio", category = "C5") %>% # choose msigdb collection of interest
dplyr::select(gs_name, gene_symbol) #just get the columns corresponding to signature name and gene symbols of genes in each signature
# pull out data for arid1b
GSEAgenes <- arid1b %>% dplyr::filter(chromosomenumber != 20)
mydata.df.sub <- dplyr::select(GSEAgenes, LLgeneAbbrev, log2FoldChange)
#get rid of duplicates
mydata.df.sub <- mydata.df.sub %>% dplyr::filter(LLgeneAbbrev %in% mydata.df.sub$LLgeneAbbrev[!duplicated(mydata.df.sub$LLgeneAbbrev)])
# construct a named vector
mydata.gsea <- mydata.df.sub$log2FoldChange
names(mydata.gsea) <- as.character(mydata.df.sub$LLgeneAbbrev)
mydata.gsea <- sort(mydata.gsea, decreasing = TRUE)
# run GSEA with C5 in arid1b
myGSEA.res <- GSEA(mydata.gsea, TERM2GENE=dr_gsea_c5, verbose=FALSE, seed=TRUE)
#convert to DF
myGSEA.df <- as_tibble(myGSEA.res)
#look at top terms
datatable(myGSEA.df,
extensions = c('KeyTable', "FixedHeader"),
options = list(keys = TRUE, searchHighlight = TRUE, pageLength = 10, lengthMenu = c("10", "25", "50", "100"))) %>%
formatRound(columns=c(2:10), digits=2)CNStermsGSEA <- read.csv("CNStermsGSEA.csv")[,2:3]
headtermsGSEA <- read.csv("headtermsGSEA.csv")[,2:3]
fullsetGSEA <- read.csv("fullsetGSEA.csv")[,2:3]
# Pull out just the columns corresponding to gene symbols and LogFC for at least one pairwise comparison for the enrichment analysis
GSEAgenes <- arid1b %>% dplyr::filter(chromosomenumber != 20)
mydata.df.sub <- dplyr::select(GSEAgenes, LLgeneAbbrev, log2FoldChange)
#get rid of duplicates
mydata.df.sub <- mydata.df.sub %>% dplyr::filter(LLgeneAbbrev %in% mydata.df.sub$LLgeneAbbrev[!duplicated(mydata.df.sub$LLgeneAbbrev)])
# construct a named vector
mydata.gsea <- mydata.df.sub$log2FoldChange
names(mydata.gsea) <- as.character(mydata.df.sub$LLgeneAbbrev)
mydata.gsea <- sort(mydata.gsea, decreasing = TRUE)
#options for doing GSEA using ZFA
#CNStermsGSEA
#headtermsGSEA
#fullsetGSEA
# run GSEA with CNS terms
myGSEA.res <- GSEA(mydata.gsea, TERM2GENE=CNStermsGSEA, verbose=FALSE, seed=TRUE, minGSSize = 80)
myGSEA.df <- as_tibble(myGSEA.res@result)
#export file
write.csv(myGSEA.df, file="arid1b_output/arid1b_CNS_GSEA.csv")
# view results as an interactive table
datatable(myGSEA.df,
extensions = c('KeyTable', "FixedHeader"),
options = list(keys = TRUE, searchHighlight = TRUE, pageLength = 10, lengthMenu = c("10", "25", "50", "100"))) %>%
formatRound(columns=c(2:10), digits=2)#make network plot
myGSEA.res <- pairwise_termsim(myGSEA.res)
arid1bnetwork <- emapplot(myGSEA.res, color="NES", categorySize="p.adjust", showCategory = 2)
#get data of out network plot
arid1bnetwork <- ggplot_build(arid1bnetwork)
networkdata <- arid1bnetwork$plot$data
#get color
myheatcolors3 <- brewer.pal(name="RdBu", n=11)
#make network plot
arid1b_network <- ggraph(networkdata) +
geom_edge_link(alpha=.8, aes_(width=~I(width)), colour='darkgrey') +
geom_node_point(aes(colour = color, size=size)) +
geom_node_text(aes(label=name), repel=TRUE) +
theme_void() +
scale_color_gradientn(colors = myheatcolors3, limit=c(-1.5,1.5), name="NES")
arid1b_network#export 700x500
#make heatmap
p3 <- heatplot(myGSEA.res, foldChange=mydata.gsea, showCategory = 2) + ggplot2::coord_flip()
#get data of out heatmap
p3 <- ggplot_build(p3)
heatmapdata <- p3$plot$data
#filter to only include fold changes > 0.5
genestoinclude <- heatmapdata %>% dplyr::filter(foldChange > 0.2 | foldChange < -0.2)
heatmapdata <- heatmapdata %>% dplyr::filter(Gene %in% genestoinclude$Gene)
#use pivot wider to make untidy table
heatmapdata <- heatmapdata %>% pivot_wider(names_from = categoryID, values_from = foldChange, values_fill=NA)
#make matrix
heatmapmatrix <- as.matrix(heatmapdata[,2:3])
rownames(heatmapmatrix) <- heatmapdata$Gene
#turn matrix into 1s and 0s and cluster
heatmapmatrix<-ifelse(abs(heatmapmatrix) > 0,1,0)
heatmapmatrix[is.na(heatmapmatrix)] = 0
#cluster rows by pearson correlation
#hc <- hclust(as.dist(1-cor(heatmapmatrix, method="spearman")), method="average") #cluster columns by spearman correlation
#turn heatmap data back into tidy table
heatmapdata <- heatmapdata %>% pivot_longer(!Gene, names_to = "categoryID", values_to = "foldChange")
#change order of data to be plotted
heatmapdata$Gene <- factor(heatmapdata$Gene, levels = rev(unique(heatmapdata$Gene)))
#rename dataframe columns
colnames(heatmapdata) <- c("Gene", "categoryID", "log2foldchange")
singlecellheatmap <- ggplot(heatmapdata, aes(x = Gene, y = categoryID, fill = log2foldchange)) +
geom_tile(color="black") +
theme_classic() +
scale_fill_continuous_divergingx(palette = 'RdBu', mid = 0, na.value="white") +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1), axis.title.x=element_blank(), axis.title.y=element_blank())
singlecellheatmap + ggplot2::coord_flip()Packages and versions necessary to reproduce the results in this report.
## R version 4.3.1 (2023-06-16 ucrt)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19045)
##
## Matrix products: default
##
##
## locale:
## [1] LC_COLLATE=English_United States.utf8
## [2] LC_CTYPE=English_United States.utf8
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.utf8
##
## time zone: America/New_York
## tzcode source: internal
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] eulerr_7.0.0
## [2] svglite_2.1.2
## [3] ggraph_2.1.0
## [4] colorspace_2.1-0
## [5] ggpubr_0.6.0
## [6] lattice_0.22-5
## [7] org.Dr.eg.db_3.18.0
## [8] ensembldb_2.26.0
## [9] AnnotationFilter_1.26.0
## [10] GenomicFeatures_1.54.1
## [11] ChIPseeker_1.38.0
## [12] orthogene_1.8.0
## [13] plotly_4.10.3
## [14] BaseSet_0.9.0
## [15] ontologyIndex_2.11
## [16] enrichplot_1.22.0
## [17] msigdbr_7.5.1
## [18] clusterProfiler_4.10.0
## [19] gprofiler2_0.2.2
## [20] GSVA_1.50.0
## [21] GSEABase_1.64.0
## [22] graph_1.80.0
## [23] annotate_1.80.0
## [24] XML_3.99-0.14
## [25] AnnotationDbi_1.64.0
## [26] DT_0.30
## [27] RColorBrewer_1.1-3
## [28] ggrepel_0.9.4
## [29] scales_1.2.1
## [30] viridis_0.6.4
## [31] viridisLite_0.4.2
## [32] cowplot_1.1.1
## [33] DESeq2_1.42.0
## [34] SummarizedExperiment_1.32.0
## [35] Biobase_2.62.0
## [36] MatrixGenerics_1.14.0
## [37] matrixStats_1.1.0
## [38] GenomicRanges_1.54.1
## [39] GenomeInfoDb_1.38.0
## [40] IRanges_2.36.0
## [41] S4Vectors_0.40.1
## [42] BiocGenerics_0.48.0
## [43] lubridate_1.9.3
## [44] forcats_1.0.0
## [45] stringr_1.5.0
## [46] dplyr_1.1.3
## [47] purrr_1.0.2
## [48] readr_2.1.4
## [49] tidyr_1.3.0
## [50] tibble_3.2.1
## [51] ggplot2_3.4.4
## [52] tidyverse_2.0.0
## [53] knitr_1.45
## [54] tinytex_0.48
## [55] rmarkdown_2.25
## [56] TxDb.Hsapiens.UCSC.hg38.knownGene_3.18.0
## [57] EnsDb.Hsapiens.v75_2.99.0
##
## loaded via a namespace (and not attached):
## [1] ProtGenerics_1.34.0
## [2] fs_1.6.3
## [3] bitops_1.0-7
## [4] HDO.db_0.99.1
## [5] httr_1.4.7
## [6] tools_4.3.1
## [7] backports_1.4.1
## [8] utf8_1.2.4
## [9] R6_2.5.1
## [10] HDF5Array_1.30.0
## [11] lazyeval_0.2.2
## [12] rhdf5filters_1.14.0
## [13] withr_2.5.2
## [14] prettyunits_1.2.0
## [15] gridExtra_2.3
## [16] cli_3.6.1
## [17] scatterpie_0.2.1
## [18] labeling_0.4.3
## [19] sass_0.4.7
## [20] systemfonts_1.0.5
## [21] Rsamtools_2.18.0
## [22] yulab.utils_0.1.0
## [23] gson_0.1.0
## [24] DOSE_3.28.0
## [25] plotrix_3.8-2
## [26] rstudioapi_0.15.0
## [27] RSQLite_2.3.2
## [28] TxDb.Hsapiens.UCSC.hg19.knownGene_3.2.2
## [29] generics_0.1.3
## [30] gridGraphics_0.5-1
## [31] BiocIO_1.12.0
## [32] crosstalk_1.2.0
## [33] vroom_1.6.4
## [34] gtools_3.9.4
## [35] car_3.1-2
## [36] homologene_1.4.68.19.3.27
## [37] GO.db_3.18.0
## [38] Matrix_1.6-1.1
## [39] fansi_1.0.5
## [40] abind_1.4-5
## [41] lifecycle_1.0.4
## [42] yaml_2.3.7
## [43] carData_3.0-5
## [44] gplots_3.1.3
## [45] rhdf5_2.46.0
## [46] qvalue_2.34.0
## [47] SparseArray_1.2.0
## [48] BiocFileCache_2.10.1
## [49] grid_4.3.1
## [50] blob_1.2.4
## [51] promises_1.2.1
## [52] crayon_1.5.2
## [53] beachmat_2.18.0
## [54] KEGGREST_1.42.0
## [55] pillar_1.9.0
## [56] fgsea_1.28.0
## [57] boot_1.3-28.1
## [58] rjson_0.2.21
## [59] codetools_0.2-19
## [60] fastmatch_1.1-4
## [61] glue_1.6.2
## [62] ggfun_0.1.3
## [63] data.table_1.14.8
## [64] vctrs_0.6.4
## [65] png_0.1-8
## [66] treeio_1.26.0
## [67] gtable_0.3.4
## [68] cachem_1.0.8
## [69] xfun_0.41
## [70] S4Arrays_1.2.0
## [71] mime_0.12
## [72] tidygraph_1.2.3
## [73] SingleCellExperiment_1.24.0
## [74] interactiveDisplayBase_1.40.0
## [75] ellipsis_0.3.2
## [76] nlme_3.1-163
## [77] ggtree_3.10.0
## [78] bit64_4.0.5
## [79] progress_1.2.2
## [80] filelock_1.0.2
## [81] bslib_0.5.1
## [82] irlba_2.3.5.1
## [83] KernSmooth_2.23-22
## [84] DBI_1.1.3
## [85] tidyselect_1.2.0
## [86] bit_4.0.5
## [87] compiler_4.3.1
## [88] curl_5.1.0
## [89] xml2_1.3.5
## [90] DelayedArray_0.28.0
## [91] shadowtext_0.1.2
## [92] rtracklayer_1.62.0
## [93] caTools_1.18.2
## [94] rappdirs_0.3.3
## [95] digest_0.6.33
## [96] XVector_0.42.0
## [97] htmltools_0.5.6.1
## [98] pkgconfig_2.0.3
## [99] sparseMatrixStats_1.14.0
## [100] highr_0.10
## [101] dbplyr_2.4.0
## [102] fastmap_1.1.1
## [103] rlang_1.1.1
## [104] htmlwidgets_1.6.2
## [105] shiny_1.7.5.1
## [106] DelayedMatrixStats_1.24.0
## [107] farver_2.1.1
## [108] jquerylib_0.1.4
## [109] jsonlite_1.8.7
## [110] BiocParallel_1.36.0
## [111] GOSemSim_2.28.0
## [112] BiocSingular_1.18.0
## [113] RCurl_1.98-1.12
## [114] magrittr_2.0.3
## [115] GenomeInfoDbData_1.2.11
## [116] ggplotify_0.1.2
## [117] patchwork_1.1.3
## [118] Rhdf5lib_1.24.0
## [119] munsell_0.5.0
## [120] Rcpp_1.0.11
## [121] ggnewscale_0.4.9
## [122] ape_5.7-1
## [123] babelgene_22.9
## [124] stringi_1.8.1
## [125] zlibbioc_1.48.0
## [126] MASS_7.3-60
## [127] AnnotationHub_3.10.0
## [128] plyr_1.8.9
## [129] parallel_4.3.1
## [130] HPO.db_0.99.2
## [131] Biostrings_2.70.1
## [132] graphlayouts_1.0.1
## [133] splines_4.3.1
## [134] hms_1.1.3
## [135] locfit_1.5-9.8
## [136] igraph_1.5.1
## [137] ggsignif_0.6.4
## [138] reshape2_1.4.4
## [139] biomaRt_2.58.0
## [140] ScaledMatrix_1.10.0
## [141] BiocVersion_3.18.0
## [142] evaluate_0.23
## [143] BiocManager_1.30.22
## [144] tzdb_0.4.0
## [145] tweenr_2.0.2
## [146] httpuv_1.6.12
## [147] grr_0.9.5
## [148] polyclip_1.10-6
## [149] ggforce_0.4.1
## [150] rsvd_1.0.5
## [151] broom_1.0.5
## [152] xtable_1.8-4
## [153] restfulr_0.0.15
## [154] tidytree_0.4.5
## [155] MPO.db_0.99.7
## [156] rstatix_0.7.2
## [157] later_1.3.1
## [158] snow_0.4-4
## [159] aplot_0.2.2
## [160] GenomicAlignments_1.38.0
## [161] memoise_2.0.1
## [162] timechange_0.2.0